import polars as pl
from xtokenizer import BaseTokenizer
from xtokenizer.utils import convert_labels, get_text_length, get_df_text_labels

# Hardcoded absolute path to the incident-statistics CSV; the file is GBK-encoded
# (see the encoding passed to pl.read_csv in the __main__ block).
# NOTE(review): machine-specific path — consider making this a CLI argument.
file = "/Users/summy/project/python/parttime/归档/text_gcn/data/北方地区不安全事件统计20240331.csv"

if __name__ == '__main__':
    # Load the incident dataset; the CSV is GBK-encoded, so polars decodes it
    # in memory before parsing.
    frame = pl.read_csv(file, encoding='GBK')
    # Split out text / label columns and the class inventory via the project helper.
    texts, labels, classes = get_df_text_labels(frame, text_col='故障描述', label_col='故障标志')
    # Build the tokenizer over the raw description column.
    corpus = frame['故障描述'].to_list()
    tokenizer = BaseTokenizer(texts=corpus)
    # Sanity check: print the first ten vocabulary entries.
    print(tokenizer.vocab[:10])